
# This file generates the cumulative number of co-inventors, for each inventor in each year
# Used in future-patenting regression
# Zihao Li. 06/2024

import pandas as pd
import numpy as np

# Root folder of the PatentsView data; main() builds its input and output paths from it.
# NOTE(review): this name shadows the builtin dir() inside this module.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

def calculate_cumulative_coauthors(df):
    """Count each inventor's distinct co-inventors to date, per patenting year.

    Args:
        df: DataFrame with one row per (patent, inventor) and at least the
            columns 'patent_id', 'inventor_id' and 'patent_year'.

    Returns:
        DataFrame with columns 'inventor_id', 'patent_year' and
        'cumulative_coauthors', one row per inventor and year in which that
        inventor has a patent, sorted by inventor then year. The count is the
        number of distinct other inventors that appeared on any of the
        inventor's patents dated that year or earlier.

    Rows with a missing patent_id, inventor_id or patent_year are ignored:
    they cannot be attributed to a patent, a person or a point in time.
    """
    known = df.dropna(subset=['patent_id', 'inventor_id', 'patent_year'])

    # patent_id -> set of all inventors listed on that patent.
    teams = {}
    for patent, inventor in zip(known['patent_id'], known['inventor_id']):
        teams.setdefault(patent, set()).add(inventor)

    # Walk the rows in year order (not patent-number order) so the running
    # sets are truly cumulative even when patent numbers do not increase
    # with patent_year.
    ordered = known.sort_values('patent_year', kind='stable')
    partners_by_inventor = {}
    totals = {}
    for inventor, year, patent in zip(
        ordered['inventor_id'], ordered['patent_year'], ordered['patent_id']
    ):
        partners = partners_by_inventor.setdefault(inventor, set())
        partners.update(teams[patent])
        partners.discard(inventor)  # an inventor is not their own co-inventor
        # The set only grows, so the last value written for a given
        # (inventor, year) is that year's end-of-year total.
        totals[(inventor, year)] = len(partners)

    keys = list(totals)
    result = pd.DataFrame({
        'inventor_id': [inventor for inventor, _ in keys],
        # Explicit dtypes keep the columns well-typed even when empty.
        'patent_year': pd.Series(
            [year for _, year in keys], dtype=known['patent_year'].dtype
        ),
        'cumulative_coauthors': pd.Series(list(totals.values()), dtype='int64'),
    })
    return result.sort_values(['inventor_id', 'patent_year']).reset_index(drop=True)


def main():
    """Build the inventor-year panel of cumulative co-inventor counts.

    Reads cleandata/g_inventor_gender_race_age.csv, keeps patents whose id
    contains no letters, computes the cumulative number of distinct
    co-inventors per inventor and year, keeps years 1981-2015, and writes
    temp/inventor_coauthor.dta with columns inventor_id, year and
    cumulative_coauthors.
    """
    print('Loading dataset: g_inventor_gender_race_age.csv...')
    # Only these three columns are used below; loading fewer saves memory.
    df = pd.read_csv(
        dir + 'cleandata/g_inventor_gender_race_age.csv',
        encoding='utf-8',
        low_memory=False,
        usecols=['patent_id', 'patent_year', 'inventor_id'],
    )

    # Drop non-numeric patents
    print('Dropping non-numeric patents...')
    df["patent_id"] = df["patent_id"].astype(str)
    df = df[~df["patent_id"].str.contains("[a-zA-Z]")].copy()
    # errors="raise" makes any remaining malformed id fail loudly.
    df["patent_id"] = pd.to_numeric(df["patent_id"], errors="raise")

    # Calculate cumulative number of co-inventors
    print('Calculating co-inventors...')
    df_coauthors = calculate_cumulative_coauthors(df)
    in_sample = df_coauthors['patent_year'].between(1981, 2015)
    # Non-inplace rename: avoids mutating a filtered view of the frame.
    df_coauthors = df_coauthors[in_sample].rename(columns={'patent_year': 'year'})

    # Export dataset
    print('Exporting dataset...')
    df_coauthors.to_stata(dir + 'temp/inventor_coauthor.dta', write_index=False)

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()







